library(tidyverse) ## Always load tidyverse
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidytext) ## Amazing package for all things text analysis
library(tidymodels) ## Modeling framework
## ── Attaching packages ────────────────────────────────────── tidymodels 1.2.0 ──
## ✔ broom 1.0.6 ✔ rsample 1.2.1
## ✔ dials 1.2.1 ✔ tune 1.2.1
## ✔ infer 1.0.7 ✔ workflows 1.1.4
## ✔ modeldata 1.4.0 ✔ workflowsets 1.1.0
## ✔ parsnip 1.2.1 ✔ yardstick 1.3.1
## ✔ recipes 1.0.10
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ recipes::fixed() masks stringr::fixed()
## ✖ dplyr::lag() masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step() masks stats::step()
## • Search for functions across packages at https://www.tidymodels.org/find/
library(prodlim) # When prompted with "Do you want to install from sources the package which needs compilation? (Yes/no/cancel)", type "no" and press "Enter"
library(textrecipes) ## Helpful feature engineering for text data
library(rstudioapi) ## set working directory
library(LiblineaR) ## for SVM models
library(ranger) ## for random forest models
library(xgboost) ## for XGBoost models
##
## Attaching package: 'xgboost'
##
## The following object is masked from 'package:dplyr':
##
## slice
The data come from https://www.kaggle.com/datasets/schmoyote/coffee-reviews-dataset?select=coffee_analysis.csv; I cleaned it up a little bit beforehand.
## Set working directory to wherever this script is.
## getActiveDocumentContext() only works inside an RStudio session, so guard
## with isAvailable(): outside RStudio (e.g. Rscript) we skip the setwd() and
## read from the current working directory instead of crashing with an
## unhelpful rstudioapi error.
if (rstudioapi::isAvailable()) {
  setwd(dirname(getActiveDocumentContext()$path))
}
## Load in csv from that path; one row per coffee, with at least a numeric
## 'rating' column and a free-text 'review' column (used throughout below)
coffee <- read.csv("coffee.csv")
Let’s take a look at our data!
## Overall distribution of ratings.
## A binwidth just under 1 keeps each integer rating in its own bar.
ggplot(coffee, aes(x = rating)) +
  geom_histogram(binwidth = 0.999999, color = "black", fill = "#B79990") +
  theme_classic()
## Tokenize for plotting: one row per (review, word) pair.
## The 'review' text column is consumed and replaced by a 'word' column.
tidy_coffee <- unnest_tokens(coffee, word, review)
## Quick look at the 50 most frequent tokens (stopwords still included)
tidy_coffee %>%
  count(word, sort = TRUE) %>%
  slice_head(n = 50) %>%
  print()
## word n
## 1 and 6662
## 2 with 3829
## 3 in 3755
## 4 cup 3339
## 5 sweet 3316
## 6 a 2762
## 7 toned 2709
## 8 the 2453
## 9 of 2452
## 10 aroma 2122
## 11 mouthfeel 2108
## 12 finish 2102
## 13 chocolate 2047
## 14 notes 1916
## 15 acidity 1727
## 16 structure 1701
## 17 fruit 1523
## 18 cocoa 1373
## 19 tart 1262
## 20 richly 1126
## 21 floral 1042
## 22 savory 919
## 23 sweetly 889
## 24 dark 845
## 25 cedar 829
## 26 syrupy 775
## 27 zest 751
## 28 balanced 742
## 29 rich 739
## 30 juicy 733
## 31 bright 711
## 32 nib 702
## 33 chocolaty 699
## 34 by 694
## 35 crisp 693
## 36 smooth 602
## 37 deeply 592
## 38 to 587
## 39 deep 570
## 40 almond 559
## 41 satiny 531
## 42 an 528
## 43 crisply 524
## 44 long 524
## 45 like 522
## 46 flowers 521
## 47 espresso 514
## 48 full 495
## 49 processed 474
## 50 nut 472
## Which words tend to appear in higher- vs. lower-rated reviews?
tidy_coffee %>%
  ## One row per distinct word...
  group_by(word) %>%
  ## ...with its usage count and the mean rating of reviews containing it
  summarise(
    number_of_uses = n(),
    rating = mean(rating)
  ) %>%
  ## Drop rare words (< 30 occurrences) to reduce noise
  filter(number_of_uses >= 30) %>%
  ## Usage count on x, associated mean rating on y
  ggplot(aes(x = number_of_uses, y = rating)) +
  ## Dashed reference line at the overall mean rating
  geom_hline(
    yintercept = mean(coffee$rating),
    linetype = "dashed",
    color = "gray50"
  ) +
  ## Label each point with its word; check_overlap suppresses collisions
  geom_text(
    aes(label = word),
    check_overlap = TRUE,
    vjust = "top",
    hjust = "left"
  ) +
  ## Counts span orders of magnitude, so use a log x-axis
  scale_x_log10() +
  ## Use classic theme
  theme_classic()
A data budget refers to the careful allocation of a dataset into different parts for training, testing, and validation. This ensures that the model can be trained and evaluated effectively, preventing overfitting and ensuring that the model generalizes well to unseen data.
Note: we’ll use cross-validation folds here, but you could always use bootstraps() instead of vfold_cv() if you would prefer that approach (bootstrapping resamples with replacement, which works well for small datasets).
## Fix the RNG state so the random split below is reproducible
set.seed(123)
## Split into training and testing sets (default 75/25), stratified on
## 'rating' so both sets see a similar distribution of ratings
coffee_split <- initial_split(coffee, strata = rating)
coffee_train <- training(coffee_split)
coffee_test <- testing(coffee_split)
## Re-seed before resampling: fold assignment is also random, and a fresh
## seed here makes the folds reproducible regardless of how many RNG draws
## happened above
set.seed(234)
## 10-fold cross-validation on the training set, again stratified by rating
coffee_folds <- vfold_cv(coffee_train, strata = rating)
## Inspect the resampling object
print(coffee_folds)
## # 10-fold cross-validation using stratification
## # A tibble: 10 × 2
## splits id
## <list> <chr>
## 1 <split [1411/159]> Fold01
## 2 <split [1412/158]> Fold02
## 3 <split [1412/158]> Fold03
## 4 <split [1412/158]> Fold04
## 5 <split [1413/157]> Fold05
## 6 <split [1413/157]> Fold06
## 7 <split [1413/157]> Fold07
## 8 <split [1414/156]> Fold08
## 9 <split [1415/155]> Fold09
## 10 <split [1415/155]> Fold10
Feature engineering is the process of creating features (i.e., input variables) for your machine learning model. In this context, we are creating features from text data using techniques like tokenization, TF/TF-IDF, and normalization. Later on, we’ll compare these approaches to see which one performs best.
## Recipe 1: bag-of-words TF-IDF features from the review text
coffee_rec_tfidf <- recipe(rating ~ review, data = coffee_train) %>%
  step_tokenize(review) %>%                      ## split reviews into word tokens
  step_tokenfilter(review, max_tokens = 500) %>% ## keep the 500 most frequent
  step_tfidf(review)                             ## weight each token by TF-IDF
## Recipe 2: same tokens as recipe 1, but raw term-frequency weights
coffee_rec_tf <- recipe(rating ~ review, data = coffee_train) %>%
  step_tokenize(review) %>%                      ## split reviews into word tokens
  step_tokenfilter(review, max_tokens = 500) %>% ## keep the 500 most frequent
  step_tf(review)                                ## plain term-frequency counts
## Recipe 3: TF-IDF as in recipe 1, plus stopword removal before the
## token filter and centering/scaling of every resulting predictor
coffee_rec_tfidf_stop_norm <- recipe(rating ~ review, data = coffee_train) %>%
  step_tokenize(review) %>%                      ## split reviews into word tokens
  step_stopwords(review) %>%                     ## drop common English stopwords
  step_tokenfilter(review, max_tokens = 500) %>% ## keep the 500 most frequent
  step_tfidf(review) %>%                         ## weight each token by TF-IDF
  step_normalize(all_predictors())               ## center & scale all features
## Sanity check (not required): estimate the recipe on the training data
## and materialize the processed training set to eyeball the features
coffee_rec_tfidf %>%
  prep() %>%
  bake(new_data = NULL)